#################################################################
#Functions and code to split bill sections from full bill downloaded as txt file
#Section splits are for 111th Congress, but can
#be updated for any congress/chamber 
#
#Last update: April 21, 2023

#Authors: Karen Simpson and Jeremy Gelman
#################################################################

# Locate section headings in a bill and work out where each section ends.
#
# Two searches are run on `text`:
#   * first_mark: an upper-case "SECTION n" heading, falling back to a
#     mixed-case "Section n." heading when none is found;
#   * marks: the later headings, trying "SEC. n", "Sec. n", "SECTION n" and
#     "Section n." in that order and keeping the first pattern that matches.
# When `marks` matched and does not begin at the same position as
# `first_mark`, the result is the first `first_mark` hit followed by every
# `marks` hit; otherwise the `first_mark` hits are used on their own.
#
# Args:
#   text: a single character string with the bill text, lines joined by "\n".
#
# Returns:
#   A data.frame with one row per heading and the columns
#     title       - text from the heading start through start + match length
#     start_i     - character index where the heading match starts
#     end_i       - index just before the next heading that starts later,
#                   or nchar(text) when there is none
#     title_end_i - start_i + match length - 3
#     pre_sec     - the two characters immediately before start_i
#   When nothing matches, a single row is returned with start_i == -1,
#   an empty title and end_i == nchar(text).
get_section_titles <- function(text) {
  first_mark <- gregexpr(
    "(\nSECTION [0-9]{1,10}\\.|\nSECTION [0-9]{1,10}) {1,}(([A-Z]*\\b){1,10}|([A-Z]*.\\b){1,10})", text
  )
  if (first_mark[[1]][1] < 0) {
    # The space after "Section" is optional so that both "Section 1." and the
    # previously required "Section1." spelling are recognised.
    first_mark <- gregexpr("\nSection ?[0-9]{1,10}\\. {1,}[A-Z]", text)
  }

  marks <- gregexpr("(\nSEC\\. [0-9]{1,}\\.|\nSEC\\. [0-9]{1,}) {1,}(([A-Z]*\\b){1,10}|([A-Z]*.\\b){1,10})", text)
  if (marks[[1]][1] < 0) {
    marks <- gregexpr("(\nSec\\. [0-9]{1,}\\.|\nSec\\. [0-9]{1,}) {1,}[A-Z]", text)
  }
  if (marks[[1]][1] < 0) {
    marks <- gregexpr("(\nSECTION [0-9]{1,}\\.|\nSECTION [0-9]{1,}) {1,}(([A-Z]*\\b){1,10}|([A-Z]*.\\b){1,10})", text)
  }
  if (marks[[1]][1] < 0) {
    marks <- gregexpr("\nSection [0-9]{1,}\\. {1,}[A-Z]", text)
  }

  first_pos <- as.numeric(first_mark[[1]])
  mark_pos <- as.numeric(marks[[1]])

  # Only scalar comparisons are used here. The earlier three-way test fed a
  # whole vector to `if`, which is an error on R >= 4.2 when the bill contains
  # several "SECTION n" headings; its middle branch could never be selected,
  # so two branches cover every case.
  if (mark_pos[1] != -1 && mark_pos[1] != first_pos[1]) {
    sec_title_start <- c(first_pos[1], mark_pos)
    sec_title_len <- c(
      attr(first_mark[[1]], "match.length")[1],
      attr(marks[[1]], "match.length")
    )
  } else {
    sec_title_start <- first_pos
    sec_title_len <- attr(first_mark[[1]], "match.length")
  }

  sec_title_end <- sec_title_start + sec_title_len
  sec_titles <- substring(text, sec_title_start, sec_title_end)

  n_sec <- length(sec_title_start)
  text_len <- nchar(text)
  if (n_sec > 1) {
    sec_ends <- numeric(n_sec)
    for (i in seq_len(n_sec)) {
      # The leading heading may sit after some of the later headings, so look
      # for the first subsequent row that really starts beyond this one.
      is_later <- seq_len(n_sec) > i &
        (sec_title_start - 1) >= sec_title_start[i]
      later_starts <- sec_title_start[is_later]
      if (length(later_starts) > 0) {
        sec_ends[i] <- later_starts[1] - 1
      } else {
        # No heading follows: the section runs to the end of the text.
        sec_ends[i] <- text_len
      }
    }
  } else {
    sec_ends <- text_len
  }

  data.frame(
    title = sec_titles,
    start_i = sec_title_start,
    end_i = sec_ends,
    title_end_i = (sec_title_end - 3),
    pre_sec = substring(text, sec_title_start - 2, sec_title_start - 1)
  )
}


##GET SECTIONS FUNCTION II

# Find every section heading in `text` with one combined pattern and report
# where each section starts and ends.
#
# The pattern accepts "SEC. n", "Sec. n", "SECTION n" and "Section n."
# headings that follow either a newline or two spaces.
#
# Args:
#   text: a single character string with the bill text, lines joined by "\n".
#
# Returns:
#   A data.frame with one row per heading and the columns title, start_i,
#   end_i, title_end_i and pre_sec. With no heading at all the single row has
#   start_i == -1, an empty title and end_i == nchar(text).
get_section_titles_2 <- function(text) {
  hits <- gregexpr("(\nSEC\\. [0-9]{1,}\\.|\nSEC\\. [0-9]{1,}|\nSec\\. [0-9]{1,}\\.|\nSec\\. [0-9]{1,}|  SEC\\. [0-9]{1,}\\.|  SEC\\. [0-9]{1,}|  Sec\\. [0-9]{1,}\\.|  Sec\\. [0-9]{1,}|\nSECTION [0-9]{1,10}\\.|\nSECTION [0-9]{1,10}|\nSection [0-9]{1,10}\\.|  SECTION [0-9]{1,10}\\.|  SECTION [0-9]{1,10}|  Section [0-9]{1,10}\\.) {1,}(([A-Z]*\\b){1,10}|([A-Z]*.\\b){1,10})",text)[[1]]

  heading_from <- as.numeric(hits)
  heading_to <- heading_from + attr(hits, "match.length")

  # Matches from a single gregexpr() call come back in increasing order, so
  # each section simply stops one character before the next heading and the
  # last one runs to the end of the text.
  if (length(heading_from) > 1) {
    section_to <- c(heading_from[-1] - 1, nchar(text))
  } else {
    section_to <- nchar(text)
  }

  data.frame(
    title = substring(text, heading_from, heading_to),
    start_i = heading_from,
    end_i = section_to,
    title_end_i = (heading_to - 3),
    pre_sec = substring(text, heading_from - 2, heading_from - 1)
  )
}

#PACKAGES

library(stringr)
library(dplyr)

##THE 111TH CONGRESS
##THE 111TH CONGRESS HOUSE

# Directories used by the House run: bill files are listed from input_dir,
# section files are written under output_dir, and base_dir is where the
# tracker CSV is written afterwards.
input_dir <- "replication/bills/"
base_dir <- "/replication/"
output_dir <- "replication/sections/111"

library(stringr)
library(dplyr)

setwd(input_dir)

#CHECK FOR MISSING FILES

# Build the tracker in a single call so that an empty directory gives a
# zero-row data frame instead of failing on the first column assignment.
# The file-name column keeps the name `ls`, which is what ends up in the CSV.
bill_files <- list.files()
n_files <- length(bill_files)
ls_df <- data.frame(
  ls = bill_files,
  output = rep("", n_files),
  done = rep("", n_files),
  name = rep("", n_files),
  type = rep("", n_files),
  char = rep(0, n_files),
  stringsAsFactors = FALSE
)
ls_df <- ls_df[!is.na(ls_df$ls), , drop = FALSE]

# Derive a tidy bill name ("<bill>_<version>") and the bill type (the letters
# of the bill identifier) from each file name.
for (x in seq_len(nrow(ls_df))) {

  billname <- gsub("BILLS-", "", ls_df$ls[x])
  # Version suffix: up to five letters, optional digits, then a literal ".txt"
  # at the end of the name.
  vers <- str_extract(billname, "[A-Za-z]{1,5}[0-9]*\\.txt$")
  # vers is data, not a pattern, so remove it as a fixed string.
  billname <- gsub(vers, "", billname, fixed = TRUE)
  billname <- paste0(billname, "_", vers)
  billname <- gsub("\\.txt$", "", billname)

  billtype <- gsub("_[A-Za-z]*", "", billname)
  billtype <- gsub("[0-9]*", "", billtype)

  ls_df$name[x] <- billname
  ls_df$type[x] <- billtype
}

#RUN THE LOOP
# For every file listed in ls_df: read the bill, trim the front and back
# matter, remove any table-of-contents section, split what is left into
# sections and write each section to its own file under output_dir. The
# outcome for each bill is recorded in ls_df, which is saved as a CSV at the
# end.
#
# NOTE(review): input_dir and output_dir are relative paths, yet the working
# directory is changed to input_dir before they are used again below. As
# written this presumably only resolves when absolute paths are substituted -
# confirm against the intended directory layout.

setwd(input_dir)

# Create the section counter before the loop. Assigning ls_df$num_secs[j] to
# a column that does not exist yet recycles the first bill's count to every
# row, and fails with a length error if the first successful bill is not row 1.
if (!("num_secs" %in% names(ls_df))) {
  ls_df$num_secs <- rep(NA_integer_, nrow(ls_df))
}

for (j in seq_len(nrow(ls_df))) {
  doc <- paste0(readLines(paste0(input_dir, ls_df[j, 1])), collapse = "\n")
  ls_df$done[j] <- 1
  ls_df$char[j] <- nchar(doc)
  text <- doc
  out <- tryCatch(
    {
      # Drop a parenthesised number that is glued to a following "s"/"S".
      text <- gsub("\\([0-9]{1,}\\)[s|S]", "S", text)

      # Cut the text at the last end-of-bill marker. "(House|Senate)" is an
      # alternation; the previous "[House|Senate]" was a character class that
      # matched any single one of those letters.
      back_matter <- gregexpr("(&lt;all&gt;|Passed the (House|Senate)|Speaker of the House of Representatives[.]|Attest:)", text)

      if (back_matter[[1]][1] > 0) {
        last_hit <- length(back_matter[[1]])
        text <- substr(text, 1, back_matter[[1]][last_hit] - 1)
      }

      # Drop everything up to and including the first enacting/resolving
      # phrase.
      front <- gregexpr("(Resolved, |  JOINT RESOLUTION|  CONCURRENT RESOLUTION|  AMENDMENT|\nAMENDMENT|  A BILL|\nA BILL| RESOLUTION|  AN ACT|\nAN ACT|  Joint Resolution|  Concurrent Resolution|  Amendment|\nAmendment|  A Bill|\nA Bill|  An Act|\nAn Act|  Resolution)", text)
      if (front[[1]][1] > 0) {
        text <- substr(text, (front[[1]][1] + attr(front[[1]], "match.length")[1]), nchar(text))
      }

      # Remove letter and digit runs attached to a hyphen, then normalise
      # section labels that were left as "SEC. -. " by those removals.
      text <- gsub("-[A-z]{1,}|[A-z]{1,}-", "", text)
      text <- gsub("-[0-9]{1,}|[0-9]{1,}-", "", text)
      text <- gsub("(SEC.|Sec.) -. ", "SEC. ", text)

      # Remove text marked as deleted.
      text <- gsub("&lt;DELETED&gt(.*?)&lt;/DELETED&gt;", "", text)

      # Flag sections whose heading mentions a table of contents.
      sections <- get_section_titles(text) %>%
        mutate(content = grepl("TABLE OF CONTENT", title),
               to_remove = ifelse(content == TRUE, 1, 0))

      # Positions refer to the text before any removal, so slice from a copy.
      orig_text <- text
      sec_i_to_remove <- which(sections$to_remove == 1)
      for (i in sec_i_to_remove) {
        txt_to_rm <- substring(orig_text, sections$start_i[i], sections$end_i[i])
        text <- gsub(txt_to_rm, "", text, fixed = TRUE)
      }

      #Here's where the splitting starts
      remaining_sections <- get_section_titles_2(text)

      if (nrow(remaining_sections) > 1 && remaining_sections$title[1] == "") {
        remaining_sections <- remaining_sections[2:nrow(remaining_sections), ]
      }

      # A heading preceded by a backtick belongs to quoted text, so it is
      # folded into the section before it. Every row is visited (the previous
      # loop only looked at the last one), in reverse so that a run of
      # consecutive quoted headings carries its final end position back to
      # the section that is kept.
      quoted <- grepl("`", remaining_sections$pre_sec)
      for (x in rev(seq_along(quoted))) {
        if (quoted[x] && x > 1) {
          remaining_sections$end_i[x - 1] <- remaining_sections$end_i[x]
        }
      }
      remaining_sections <- remaining_sections[!quoted, , drop = FALSE]

      # Nothing left to write: report it through the error path, as before.
      if (nrow(remaining_sections) == 0) {
        stop("every section heading found is inside quoted text")
      }

      # Build a short label ("sec.N") for each section's file name. Later
      # checks override earlier ones.
      remaining_sections$name <- ""
      for (i in seq_len(nrow(remaining_sections))) {
        if (grepl("SECTION [0-9]{1,}", remaining_sections$title[i])) {
          remaining_sections$name[i] <- paste0("SEC.", str_extract(remaining_sections$title[i], " [0-9]{1,}"))
        }
        if (grepl("Section [0-9]{1,}", remaining_sections$title[i])) {
          remaining_sections$name[i] <- paste0("SEC.", str_extract(remaining_sections$title[i], " [0-9]{1,}"))
        }
        if (grepl("SEC\\. [0-9]{1,}", remaining_sections$title[i])) {
          remaining_sections$name[i] <- str_extract(remaining_sections$title[i], "SEC\\. [0-9]{1,}")
        }
        if (grepl("Sec\\. [0-9]{1,}", remaining_sections$title[i])) {
          remaining_sections$name[i] <- str_extract(remaining_sections$title[i], "Sec\\. [0-9]{1,}")
        }
      }

      remaining_sections$name <- tolower(remaining_sections$name)
      remaining_sections$name <- gsub(" ", "", remaining_sections$name)

      secs <- length(remaining_sections$name)
      ls_df$num_secs[j] <- secs

      orig_text2 <- text
      n_sections <- nrow(remaining_sections)
      for (i in seq_len(n_sections)) {
        sec_start <- remaining_sections$start_i[i]
        sec_end <- remaining_sections$end_i[i]
        if (sec_start > -1) {
          section <- substring(orig_text2, sec_start, sec_end)
          write.table(section, file = paste0(output_dir, "/", ls_df$name[j], "_", remaining_sections$name[i], ".txt"))
        } else if (n_sections == 1) {
          # No heading was found at all: keep the whole text as one file.
          write.table(orig_text2, file = paste0(output_dir, "/", ls_df$name[j], "_", "NA", ".txt"))
        }
      }
      NULL
    },
    error = function(e) {
      # Still recorded as "error" in the tracker, but say why on the console.
      message("Could not split ", ls_df[j, 1], ": ", conditionMessage(e))
      "error"
    }
  )
  if (identical(out, "error")) {
    ls_df$output[j] <- "error"
  }
}

setwd(base_dir)

write.csv(ls_df, file = "tracker111_hr.csv")

#FOR THE 111TH CONGRESS SENATE

setwd(base_dir)

# Directories used by the Senate run: bill files are listed from input_dir,
# section files are written under output_dir, and base_dir is where the
# tracker CSV is written afterwards.
input_dir <- "replication/bills/"
base_dir <- "/replication/"
output_dir <- "replication/sections/111"

#CHECK FOR MISSING FILES

setwd(input_dir)

# Build the tracker in a single call so that an empty directory gives a
# zero-row data frame instead of failing on the first column assignment.
# The file-name column keeps the name `ls`, which is what ends up in the CSV.
bill_files <- list.files()
n_files <- length(bill_files)
ls_df <- data.frame(
  ls = bill_files,
  output = rep("", n_files),
  done = rep("", n_files),
  name = rep("", n_files),
  type = rep("", n_files),
  char = rep(0, n_files),
  stringsAsFactors = FALSE
)
ls_df <- ls_df[!is.na(ls_df$ls), , drop = FALSE]

# Derive a tidy bill name ("<bill>_<version>") and the bill type (the letters
# of the bill identifier) from each file name.
for (x in seq_len(nrow(ls_df))) {

  billname <- gsub("BILLS-", "", ls_df$ls[x])
  # Version suffix: up to five letters, optional digits, then a literal ".txt"
  # at the end of the name.
  vers <- str_extract(billname, "[A-Za-z]{1,5}[0-9]*\\.txt$")
  # vers is data, not a pattern, so remove it as a fixed string.
  billname <- gsub(vers, "", billname, fixed = TRUE)
  billname <- paste0(billname, "_", vers)
  billname <- gsub("\\.txt$", "", billname)

  billtype <- gsub("_[A-Za-z]*", "", billname)
  billtype <- gsub("[0-9]*", "", billtype)

  ls_df$name[x] <- billname
  ls_df$type[x] <- billtype
}

#RUN THE LOOP
# For every file listed in ls_df: read the bill, trim the front and back
# matter, remove any table-of-contents section, split what is left into
# sections and write each section to its own file under output_dir. The
# outcome for each bill is recorded in ls_df, which is saved as a CSV at the
# end.
#
# NOTE(review): input_dir and output_dir are relative paths, yet the working
# directory is changed to input_dir before they are used again below. As
# written this presumably only resolves when absolute paths are substituted -
# confirm against the intended directory layout.

setwd(input_dir)

# Create the section counter before the loop. Assigning ls_df$num_secs[j] to
# a column that does not exist yet recycles the first bill's count to every
# row, and fails with a length error if the first successful bill is not row 1.
if (!("num_secs" %in% names(ls_df))) {
  ls_df$num_secs <- rep(NA_integer_, nrow(ls_df))
}

for (j in seq_len(nrow(ls_df))) {
  doc <- paste0(readLines(paste0(input_dir, ls_df[j, 1])), collapse = "\n")
  ls_df$done[j] <- 1
  ls_df$char[j] <- nchar(doc)
  text <- doc
  out <- tryCatch(
    {
      # Drop a parenthesised number that is glued to a following "s"/"S".
      text <- gsub("\\([0-9]{1,}\\)[s|S]", "S", text)

      # Cut the text at the last end-of-bill marker. "(House|Senate)" is an
      # alternation; the previous "[House|Senate]" was a character class that
      # matched any single one of those letters.
      back_matter <- gregexpr("(&lt;all&gt;|Passed the (House|Senate)|Speaker of the House of Representatives[.]|Attest:)", text)

      if (back_matter[[1]][1] > 0) {
        last_hit <- length(back_matter[[1]])
        text <- substr(text, 1, back_matter[[1]][last_hit] - 1)
      }

      # Drop everything up to and including the first enacting/resolving
      # phrase.
      front <- gregexpr("(Resolved, |  JOINT RESOLUTION|  CONCURRENT RESOLUTION|  AMENDMENT|\nAMENDMENT|  A BILL|\nA BILL| RESOLUTION|  AN ACT|\nAN ACT|  Joint Resolution|  Concurrent Resolution|  Amendment|\nAmendment|  A Bill|\nA Bill|  An Act|\nAn Act|  Resolution)", text)
      if (front[[1]][1] > 0) {
        text <- substr(text, (front[[1]][1] + attr(front[[1]], "match.length")[1]), nchar(text))
      }

      # Remove letter and digit runs attached to a hyphen, then normalise
      # section labels that were left as "SEC. -. " by those removals.
      text <- gsub("-[A-z]{1,}|[A-z]{1,}-", "", text)
      text <- gsub("-[0-9]{1,}|[0-9]{1,}-", "", text)
      text <- gsub("(SEC.|Sec.) -. ", "SEC. ", text)

      # Remove text marked as deleted.
      text <- gsub("&lt;DELETED&gt(.*?)&lt;/DELETED&gt;", "", text)

      # Flag sections whose heading mentions a table of contents.
      sections <- get_section_titles(text) %>%
        mutate(content = grepl("TABLE OF CONTENT", title),
               to_remove = ifelse(content == TRUE, 1, 0))

      # Positions refer to the text before any removal, so slice from a copy.
      orig_text <- text
      sec_i_to_remove <- which(sections$to_remove == 1)
      for (i in sec_i_to_remove) {
        txt_to_rm <- substring(orig_text, sections$start_i[i], sections$end_i[i])
        text <- gsub(txt_to_rm, "", text, fixed = TRUE)
      }

      #Here's where the splitting starts
      remaining_sections <- get_section_titles_2(text)

      if (nrow(remaining_sections) > 1 && remaining_sections$title[1] == "") {
        remaining_sections <- remaining_sections[2:nrow(remaining_sections), ]
      }

      # A heading preceded by a backtick belongs to quoted text, so it is
      # folded into the section before it. Every row is visited (the previous
      # loop only looked at the last one), in reverse so that a run of
      # consecutive quoted headings carries its final end position back to
      # the section that is kept.
      quoted <- grepl("`", remaining_sections$pre_sec)
      for (x in rev(seq_along(quoted))) {
        if (quoted[x] && x > 1) {
          remaining_sections$end_i[x - 1] <- remaining_sections$end_i[x]
        }
      }
      remaining_sections <- remaining_sections[!quoted, , drop = FALSE]

      # Nothing left to write: report it through the error path, as before.
      if (nrow(remaining_sections) == 0) {
        stop("every section heading found is inside quoted text")
      }

      # Build a short label ("sec.N") for each section's file name. Later
      # checks override earlier ones.
      remaining_sections$name <- ""
      for (i in seq_len(nrow(remaining_sections))) {
        if (grepl("SECTION [0-9]{1,}", remaining_sections$title[i])) {
          remaining_sections$name[i] <- paste0("SEC.", str_extract(remaining_sections$title[i], " [0-9]{1,}"))
        }
        if (grepl("Section [0-9]{1,}", remaining_sections$title[i])) {
          remaining_sections$name[i] <- paste0("SEC.", str_extract(remaining_sections$title[i], " [0-9]{1,}"))
        }
        if (grepl("SEC\\. [0-9]{1,}", remaining_sections$title[i])) {
          remaining_sections$name[i] <- str_extract(remaining_sections$title[i], "SEC\\. [0-9]{1,}")
        }
        if (grepl("Sec\\. [0-9]{1,}", remaining_sections$title[i])) {
          remaining_sections$name[i] <- str_extract(remaining_sections$title[i], "Sec\\. [0-9]{1,}")
        }
      }

      remaining_sections$name <- tolower(remaining_sections$name)
      remaining_sections$name <- gsub(" ", "", remaining_sections$name)

      secs <- length(remaining_sections$name)
      ls_df$num_secs[j] <- secs

      orig_text2 <- text
      n_sections <- nrow(remaining_sections)
      for (i in seq_len(n_sections)) {
        sec_start <- remaining_sections$start_i[i]
        sec_end <- remaining_sections$end_i[i]
        if (sec_start > -1) {
          section <- substring(orig_text2, sec_start, sec_end)
          write.table(section, file = paste0(output_dir, "/", ls_df$name[j], "_", remaining_sections$name[i], ".txt"))
        } else if (n_sections == 1) {
          # No heading was found at all: keep the whole text as one file.
          write.table(orig_text2, file = paste0(output_dir, "/", ls_df$name[j], "_", "NA", ".txt"))
        }
      }
      NULL
    },
    error = function(e) {
      # Still recorded as "error" in the tracker, but say why on the console.
      message("Could not split ", ls_df[j, 1], ": ", conditionMessage(e))
      "error"
    }
  )
  if (identical(out, "error")) {
    ls_df$output[j] <- "error"
  }
}

setwd(base_dir)

write.csv(ls_df, file = "tracker111_s.csv")

